Task #1 - Set working directory & load packages

# When running interactively inside RStudio, move the working directory to the
# folder containing this script so relative paths (e.g. "input/") resolve.
library(rstudioapi)
if (rstudioapi::isAvailable()) {
     script_path <- rstudioapi::getActiveDocumentContext()$path
     Encoding(script_path) <- "UTF-8"
     setwd(dirname(script_path))
}

# Load libraries
library(tidyverse)
library(gridExtra) # to plot multiple ggplot objects
library(scales)
library(lubridate)
library(plotly)

Task #2 - Merge the 12 months of sales data into a single CSV file

# List every CSV in the "input" folder. Note: list.files() interprets
# `pattern` as a regular expression, not a glob -- "*.csv" would match any
# name merely containing "csv", so anchor it as "\\.csv$".
file_names <- list.files(
     path = "input",
     pattern = "\\.csv$",
     full.names = TRUE)

# Read each file into its own dataframe named after the file (directory and
# ".csv" extension stripped). basename() replaces the original hard-coded
# substr(i, 7, ...) offset, which silently breaks if the input path changes.
for (file in file_names){
     assign(sub("\\.csv$", "", basename(file)), read_csv(file))
}

# Collect every data.frame currently in the global environment (at this point,
# exactly the monthly sales tables read above). inherits() avoids the
# methods-package dependency of is() and is the idiomatic class check.
list_dataframes <- Filter(function(x) inherits(x, "data.frame"), mget(ls()))

# Stack them into a single dataframe. bind_rows() matches columns by name and
# is much faster than do.call(rbind, ...) across many tibbles.
all_months_data <- bind_rows(list_dataframes)

# remove individual monthly sales dataframes (for cleanliness)
rm(list=ls(pattern="Sales_"))

Task #3 - Explore the merged dataset

# view first 8 rows -- note row 2 is entirely NA (a blank line in the raw data)
head(all_months_data, 8)
## # A tibble: 8 × 6
##   `Order ID` Product `Quantity Order… `Price Each` `Order Date` `Purchase Addre…
##   <chr>      <chr>   <chr>            <chr>        <chr>        <chr>           
## 1 176558     USB-C … 2                11.95        04/19/19 08… 917 1st St, Dal…
## 2 <NA>       <NA>    <NA>             <NA>         <NA>         <NA>            
## 3 176559     Bose S… 1                99.99        04/07/19 22… 682 Chestnut St…
## 4 176560     Google… 1                600          04/12/19 14… 669 Spruce St, …
## 5 176560     Wired … 1                11.99        04/12/19 14… 669 Spruce St, …
## 6 176561     Wired … 1                11.99        04/30/19 09… 333 8th St, Los…
## 7 176562     USB-C … 1                11.95        04/29/19 13… 381 Wilson St, …
## 8 176563     Bose S… 1                99.99        04/02/19 07… 668 Center St, …
# view structure of dataframe -- every column was parsed as character, so the
# numeric and date fields will need explicit type conversion during cleaning
str(all_months_data)
## spec_tbl_df [186,850 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Order ID        : chr [1:186850] "176558" NA "176559" "176560" ...
##  $ Product         : chr [1:186850] "USB-C Charging Cable" NA "Bose SoundSport Headphones" "Google Phone" ...
##  $ Quantity Ordered: chr [1:186850] "2" NA "1" "1" ...
##  $ Price Each      : chr [1:186850] "11.95" NA "99.99" "600" ...
##  $ Order Date      : chr [1:186850] "04/19/19 08:46" NA "04/07/19 22:30" "04/12/19 14:38" ...
##  $ Purchase Address: chr [1:186850] "917 1st St, Dallas, TX 75001" NA "682 Chestnut St, Boston, MA 02215" "669 Spruce St, Los Angeles, CA 90001" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Order ID` = col_character(),
##   ..   Product = col_character(),
##   ..   `Quantity Ordered` = col_character(),
##   ..   `Price Each` = col_character(),
##   ..   `Order Date` = col_character(),
##   ..   `Purchase Address` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# count # of NA values in each column -- 545 NAs in every column (presumably
# the blank rows like row 2 above); these get dropped during cleaning
colSums(is.na(all_months_data))
##         Order ID          Product Quantity Ordered       Price Each 
##              545              545              545              545 
##       Order Date Purchase Address 
##              545              545

Task #4 - Data cleaning!

# Standardize column headers (spaces -> underscores) and convert the character
# columns to their proper types. Order_Date_Time must be parsed *before*
# Order_Date is overwritten, since both derive from the original date string.
all_months_data <- all_months_data %>%
     rename_with(~ str_replace_all(.x, " ", "_")) %>%
     mutate(
          Order_Date_Time = as.POSIXct(Order_Date, format="%m/%d/%y %H:%M", tz=Sys.timezone()),
          Order_Date = as.Date(Order_Date, format = "%m/%d/%y"),
          Quantity_Ordered = as.numeric(Quantity_Ordered),
          Price_Each = as.numeric(Price_Each))

# Given that we've spotted a couple missing values in our dataset, we'll need to drop any NA values.
# NOTE(review): the count below (903) exceeds the 545 all-NA rows found in
# task #3 -- presumably the type conversions above coerced some unparseable
# values (e.g. repeated header rows in the CSVs) to NA as well; verify
# against the raw files if exactness matters.
all_months_data_droppedNAs <- na.omit(all_months_data)

# QA if we've dropped a maximum of 900 rows based on our data exploration discovery in task #3
nrow(all_months_data)-nrow(all_months_data_droppedNAs)
## [1] 903

Task #5 - Add new columns

# Enrich the cleaned data with analysis columns: numeric month, line-item
# revenue, a coarse product category, and the purchase city.
all_months_data_droppedNAs <- all_months_data_droppedNAs %>%
     mutate(
          # numeric month of the order (1-12)
          Month = as.numeric(format(Order_Date, "%m")),
          # revenue for the line item
          Sales = Price_Each * Quantity_Ordered,
          # coarse product grouping; the first matching rule wins
          Product_Category = case_when(
            str_detect(Product, "Laptop") ~ "Laptop",
            str_detect(Product, "Monitor|TV") ~ "Monitor/TV",
            str_detect(Product, "Charging|Batteries") ~ "Charging Accessories",
            str_detect(Product, "Headphones") ~ "Headphones",
            str_detect(Product, "Phone") ~ "Smartphone",
            str_detect(Product, "Washing|Dryer") ~ "Cleaning",
            TRUE ~ "Other"),
          # city = text between the first and second commas of the address
          # (retains the leading space, same as the original two-step split)
          City = str_split_fixed(Purchase_Address, ",", 3)[, 2])

# preview the cleaned & enriched dataframe (now 11 columns)
head(all_months_data_droppedNAs)
## # A tibble: 6 × 11
##   Order_ID Product    Quantity_Ordered Price_Each Order_Date Purchase_Address   
##   <chr>    <chr>                 <dbl>      <dbl> <date>     <chr>              
## 1 176558   USB-C Cha…                2       12.0 2019-04-19 917 1st St, Dallas…
## 2 176559   Bose Soun…                1      100.  2019-04-07 682 Chestnut St, B…
## 3 176560   Google Ph…                1      600   2019-04-12 669 Spruce St, Los…
## 4 176560   Wired Hea…                1       12.0 2019-04-12 669 Spruce St, Los…
## 5 176561   Wired Hea…                1       12.0 2019-04-30 333 8th St, Los An…
## 6 176562   USB-C Cha…                1       12.0 2019-04-29 381 Wilson St, San…
## # … with 5 more variables: Order_Date_Time <dttm>, Month <dbl>, Sales <dbl>,
## #   Product_Category <chr>, City <chr>

Question 1: What was the best month for sales?

# Total revenue per month, rendered as an interactive bar chart.
ggplotly(
     all_months_data_droppedNAs %>%
          group_by(Month) %>%
          summarise(Total_Sales = sum(Sales)) %>%
          ungroup() %>%
          ggplot(aes(x = as.factor(Month), y = Total_Sales)) +
          # geom_col() is the shorthand for geom_bar(stat = 'identity')
          geom_col(fill = "royal blue") +
          # rotated in-bar labels showing revenue in $ millions
          geom_text(
               aes(label = paste0(dollar_format()(Total_Sales/1000000), "M")),
               position = position_stack(vjust = 0.5),
               size = 5,
               angle = 90,
               color = "white") +
          labs(
               title = "December was the best month for sales",
               x = "Month",
               y = "Sales in USD ($ Millions)") +
          theme_light())

Question 2: What city generated the highest sales revenue?

# Total revenue per city, as horizontal bars sorted by revenue.
ggplotly(
     all_months_data_droppedNAs %>%
          group_by(City) %>%
          summarise(Total_Sales = sum(Sales)) %>%
          ungroup() %>%
          # order cities by revenue so the flipped bars read top-down
          ggplot(aes(x = reorder(City, Total_Sales), y = Total_Sales)) +
          geom_col(fill = "royal blue") +
          coord_flip() +
          # in-bar labels showing revenue in $ millions
          geom_text(
               aes(label = paste0(dollar_format()(Total_Sales/1000000), "M")),
               position = position_stack(vjust = 0.5),
               size = 5,
               color = "white") +
          labs(
               title = "San Francisco earns the highest number of sales",
               x = "City",
               y = "Sales in USD ($ Millions)") +
          theme_light())

Question 4: What products are most often sold together?

# Top 20 multi-product baskets by order frequency.
ggplotly(all_months_data_droppedNAs %>%
  # Collapse each order to one row: its comma-separated product list plus a
  # distinct-product count. (The original used a summarise() that returned
  # multiple rows per group -- deprecated in dplyr >= 1.1 in favor of
  # reframe(); this one-row-per-group form is equivalent and warning-free.)
  group_by(Order_ID) %>%
  summarise(
    Order_items = paste0(Product, collapse = ", "),
    Total_Products = n_distinct(Product),
    .groups = "drop") %>%
  # keep only orders containing more than one distinct product
  filter(Total_Products > 1) %>%
  # with one row per order, counting rows per basket == distinct orders
  count(Order_items, name = "Order_Frequency") %>%
  arrange(desc(Order_Frequency)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(Order_items, Order_Frequency), y = Order_Frequency)) +
  geom_bar(stat = 'identity', fill = "royal blue") +
  coord_flip() +
  labs(
    title = "Smart Phones are typically purchased along with charging cables or audio accessories",
    x = "List of Ordered Products",
    y = "Order Frequency"
  ) +
  theme_light() +
  geom_text(aes(
          label = Order_Frequency), 
          position = position_stack(vjust = 0.5), 
          size = 4,
          color = "white"))

Question 5: Which product generated the highest revenue?

# Revenue per product, bars colored by product category.
ggplotly(
  all_months_data_droppedNAs %>%
    group_by(Product, Product_Category) %>%
    summarise(Total_Sales = sum(Sales)) %>%
    ungroup() %>%
    # order products by revenue so the flipped bars read top-down
    ggplot(aes(x = reorder(Product, Total_Sales), y = Total_Sales, fill = Product_Category)) +
    geom_col() +
    coord_flip() +
    # in-bar labels showing revenue in $ millions
    geom_text(
      aes(label = paste0(dollar_format()(Total_Sales/1000000), "M")),
      position = position_stack(vjust = 0.5),
      size = 5,
      color = "white") +
    labs(
      title = "Laptops & Smartphones generates the most revenue",
      x = "Product",
      y = "Total Revenue"
    ) +
    theme_light())

Question 6: What time should we display advertisements to maximize the likelihood of customers buying products?

# Derive time-of-day columns from the order timestamp. Order_Date_Time is
# already POSIXct, so format() can be applied to it directly.
all_months_data_droppedNAs <- all_months_data_droppedNAs %>%
  mutate(
    Hour = format(Order_Date_Time, format = "%H"),
    Minute = format(Order_Date_Time, format = "%M"))

# Distinct orders per hour, one line per city.
all_months_data_droppedNAs %>%
  group_by(City, Hour) %>%
  summarise(Total_Orders = n_distinct(Order_ID)) %>%
  ungroup() %>%
  ggplot(aes(x = as.numeric(Hour), y = Total_Orders, color = City)) +
  geom_line() +
  geom_point() +
  theme_light() +
  labs(
    title = "Most sales occur between 10AM-1PM or 6PM-8PM",
    x = "Hour",
    y = "# of Orders")